#Proteomics analysis at the individual protein level setwd('/Users/emmatimminsschiffman/Documents/Dissertation/proteomics/DB post-genome/OT expression comparison') speccounts<-read.csv('all samples spec counts.csv', header=T, row.names=1) #look at frequency of occurrence foa.plots(speccounts) #screen data for insufficient variables #how many proteins are expressed in only half of the oysters? testdata<-drop.var(speccounts, min.fo=8) str(testdata) #35 #which proteins occur in over 95% of the oysters? testdata2<-drop.var(speccounts, max.po=95) str(testdata2) #779 #how many proteins have little variation? testdata3<-drop.var(speccounts, min.cv=5) str(testdata3) #0 #get rid of the rare (expressed in less than half the oysters) and super abundant (expressed in most of the oysters) proteins speccounts2<-drop.var(speccounts,min.fo=8) speccounts3<-drop.var(speccounts2, max.po=95) #this leaves 427 proteins #log(x+1) transformation - need the +1 for 0 values speccounts.1<-(speccounts3+1) speccounts.trans<-data.trans(speccounts.1, method='log', plot=F) #bray curtis matrix speccounts.bc<-vegdist(speccounts.trans, method='bray') speccounts.bc #NMDS speccounts.nmds<-metaMDS(speccounts.trans, distance='bray', k=2, trymax=100, autotransform=F) #stress is low, so 2 dimensions is good nmds.scree(speccounts.trans, distance='bray', k=10, autotransform=F, trymax=20) plot(speccounts.nmds, type='n') text(speccounts.nmds, labels=row.names(speccounts.trans)) #calculate loadings on each axis vec.spec<-envfit(speccounts.nmds$points, speccounts.trans, perm=100) #*0.05=10008811, 10013314, 10007261, 10025805, 10026987, 10007643, 10003923, 10021998, 10008240, 10020454, 10014801, 1005085, 10012796, 1004243, 10020479, 10006472, 10027362, 10012406, 10010975, 10027546, 10020770, 10001845, 10024853, 10000055, 10004733, 10015183, 10006979, 10023886, 10005508, 10008200, 10025760, 10008961, 10023712, 10006329, 10022421, 10000994, 10000033, 10026642, 10028023, 10002631, 10000067, 10007636, 10012096, 10022619, 10015552, 10016641 #**0.01=10014956, 10000591, 10018703, 10010841, 10011108, 10016815, 10002017, 10016941, 10006212, 10006511, 10014919, 10021336, 10016539, 10019610 ordiplot(speccounts.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') plot(vec.spec, p.max=0.01, col='blue', cex=0.5) ordihull(speccounts.nmds, treatment[,3], label=T) #repeat the above except without removing high or low abundance proteins spec.plus1<-(speccounts+1) spec.trans<-data.trans(spec.plus1, method='log', plot=F) speccounts2.nmds<-metaMDS(spec.trans, distance='bray', k=2, trymax=100, autotransform=F) #low stress ordiplot(speccounts2.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') ordihull(speccounts2.nmds, treatment[,3], label=T) vec2.spec<-envfit(speccounts2.nmds$points, spec.trans, perm=100) #output saved as spreadsheet, all proteins NMDS loadings 120712 plot(vec2.spec, p.max=0.01, col='blue', cex=0.5) #only low abundance proteins removed speccounts2.1<-(speccounts2+1) speccounts2.trans<-data.trans(speccounts2.1, method='log', plot=F) speccounts3.nmds<-metaMDS(speccounts2.trans, distance='bray', k=2, trymax=100, autotransform=F) ordiplot(speccounts3.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') ordihull(speccounts3.nmds, treatment[,3], label=F, col='grey47', cex=0.75) vec3.spec<-envfit(speccounts3.nmds$points, speccounts2.trans, perm=100) #output saved as no low abundance NMDS loadings 120712 plot(vec3.spec, p.max=0.01, col='blue', cex=0.5) #low abundance removed and compare just high and low pCO2, no MS spec2.sub<-read.csv('CO2 only samples spec counts.csv', header=T, row.names=1) spec2sub.nmds<-metaMDS(spec2.sub, distance='bray', k=2, trymax=100, autotransform=F) treat.sub<-read.csv('CO2 treatment.csv', header=T, row.names=1) ordiplot(spec2sub.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') ordihull(spec2sub.nmds, treat.sub[,1], label=F, col='grey47') specsub.row<-data.stand(spec2.sub, method='total', margin='row', plot=F) specsub.d<-vegdist(specsub.row, 'bray') co2.anos<-anosim(specsub.d, grouping=treat.sub[,1]) summary(co2.anos) #R=0.01042, p=0.344 #only high abundance proteins removed speccounts4<-drop.var(speccounts, max.po=95) speccounts4.1<-(speccounts4+1) speccounts4.trans<-data.trans(speccounts4.1, method='log', plot=F) speccounts4.nmds<-metaMDS(speccounts4.trans, distance='bray', k=2, trymax=100, autotransform=F) ordiplot(speccounts4.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') vec4.spec<-envfit(speccounts4.nmds$points, speccounts4.trans, perm=100) #output saved as no high abundance NMDS loadings 120712 plot(vec4.spec, p.max=0.01, col='blue', cex=0.5) ordihull(speccounts4.nmds, treatment[,3], label=T) #ANOSIM #read in file of individuals assigned to treatment groups treatment<-read.csv('treatment groupings.csv', header=T, row.names=1) #normalize dataset by row spec.row<-data.stand(speccounts, method='total', margin='row', plot=F) #create dissimilarity matrix using Bray-Curtis coefficient spec.d<-vegdist(spec.row, 'bray') #anosim for pCO2 grouping y.anosim1<-anosim(spec.d, grouping=treatment[,1]) summary(y.anosim1) #R=-0.01786, p=0.571 plot.anosim(y.anosim1) #anosim for MS grouping y.anosim2<-anosim(spec.d, grouping=treatment[,2]) summary(y.anosim2) #R=-0.024, p=0.563 plot.anosim(y.anosim2) #anosim for all treatments y.anosim3<-anosim(spec.d, treatment[,3]) summary(y.anosim3) #R=-0.01302, p=0.556 plot.anosim(y.anosim3) #ANOSIM with low abundance proteins removed spec2.row<-data.stand(speccounts2, method='total', margin='row', plot=F) spec2.d<-vegdist(spec2.row, 'bray') spec2.anos<-anosim(spec2.d, grouping=treatment[,3]) summary(spec2.anos) #R=-0.01823, p=0.57 #PROTEOMICS ANALYSIS AT GO LEVEL #BIOLOGICAL PROCESSES spec.go<-read.csv('GO processes spec counts.csv', header=T, row.names=1) #log(x+1) transformation - need the +1 for 0 values spec.go.1<-(spec.go+1) specgo.trans<-data.trans(spec.go.1, method='log', plot=F) #NMDS specgo.nmds<-metaMDS(specgo.trans, distance='bray', k=2, trymax=100, autotransform=F) #stress is low, so 2 dimensions is good nmds.scree(specgo.trans, distance='bray', k=10, autotransform=F, trymax=20) plot(specgo.nmds, type='n') text(specgo.nmds, labels=row.names(specgo.trans)) #calculate loadings on each axis vec.specgo<-envfit(specgo.nmds$points, specgo.trans, perm=100) #saved as sheet in workbook ordiplot(specgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') plot(vec.specgo, p.max=0.01, col='blue', cex=0.5) ordihull(specgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue')) #anosim specgo.row<-data.stand(spec.go, method='total', margin='row', plot=F) specgo.d<-vegdist(specgo.row, 'bray') #pCO2 go.anosim1<-anosim(specgo.d, treatment[,1]) summary(go.anosim1) #R=0.1473, p=0.029 #MS go.anosim2<-anosim(specgo.d, treatment[,2]) summary(go.anosim2) #R=-0.0106, p=0.504 #both go.anosim3<-anosim(specgo.d, treatment[,3]) summary(go.anosim3) #R=0.1562, p=0.05 plot.anosim(go.anosim1) plot.anosim(go.anosim3) #CELLULAR COMPONENTS cc.go<-read.csv('GO components spec counts.csv', header=T, row.names=1) cc.go.1<-(cc.go+1) cc.go.trans<-data.trans(cc.go.1, method='log', plot=F) #NMDS ccgo.nmds<-metaMDS(cc.go.trans, distance='bray', k=2, trymax=100, autotransform=F) vec.ccgo<-envfit(ccgo.nmds$points, cc.go.trans, perm=100) ordiplot(ccgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') plot(vec.ccgo, p.max=0.01, col='blue', cex=0.5) ordihull(ccgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue')) #anosim ccgo.row<-data.stand(cc.go, method='total', margin='row', plot=F) ccgo.d<-vegdist(ccgo.row, 'bray') #pCO2 cc.anosim1<-anosim(ccgo.d, treatment[,1]) summary(cc.anosim1) #R=0.000558, p=0.424 #MS cc.anosim2<-anosim(ccgo.d, treatment[,2]) summary(cc.anosim2) #R=0.007254, p=0.378 #both cc.anosim3<-anosim(ccgo.d, treatment[,3]) summary(cc.anosim3) #R=0.04861, p=0.248 #MOLECULAR FUNCTION mf.go<-read.csv('GO function spec counts.csv', header=T, row.names=1) mf.go.1<-(mf.go+1) mf.go.trans<-data.trans(mf.go.1, method='log', plot=F) #NMDS mfgo.nmds<-metaMDS(mf.go.trans, distance='bray', k=2, trymax=100, autotransform=F) vec.mfgo<-envfit(mfgo.nmds$points, mf.go.trans, perm=100) ordiplot(mfgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') plot(vec.mfgo, p.max=0.01, col='blue', cex=0.5) ordihull(mfgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue')) #anosim mfgo.row<-data.stand(mf.go, method='total', margin='row', plot=F) mfgo.d<-vegdist(mfgo.row, 'bray') #pCO2 mf.anosim1<-anosim(mfgo.d, treatment[,1]) summary(mf.anosim1) #R=-0.04129, p=0.749 #MS mf.anosim2<-anosim(mfgo.d, treatment[,2]) summary(mf.anosim2) #R=0.02455, p=0.338 #both mf.anosim3<-anosim(mfgo.d, treatment[,3]) summary(mf.anosim3) #R=0.02865, p=0.333 #PROTEOMICS ANALYSIS AT GO SLIM LEVEL #BIOLOGICAL PROCESSES bpslim<-read.csv('GO Slim processes spec counts.csv', header=T, row.names=1) bpslim.trans<-data.trans(bpslim, method='log', plot=F) #NMDS bpslim.nmds<-metaMDS(bpslim.trans, distance='bray', k=2, trymax=100, autotransform=F) vec.bpslim<-envfit(bpslim.nmds$points, bpslim.trans, perm=100) ordiplot(bpslim.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2') plot(vec.bpslim, p.max=0.01, col='blue', cex=0.5) ordihull(bpslim.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue')) #anosim bpslim.row<-data.stand(bpslim, method='total', margin='row', plot=F) bpslim.d<-vegdist(bpslim.row, 'bray') #pCO2 bpslim.anosim1<-anosim(bpslim.d, treatment[,1]) summary(bpslim.anosim1) #R=0.1886, p=0.008 plot.anosim(bpslim.anosim1) #MS bpslim.anosim2<-anosim(bpslim.d, treatment[,2]) summary(bpslim.anosim2) #R=-0.01004, p=0.49 #both bpslim.anosim3<-anosim(bpslim.d, treatment[,3]) summary(bpslim.anosim3) #R=0.2691, p=0.003 plot.anosim(bpslim.anosim3)